Assignment 1

1.1

# keep columns 1,2,5,6,7,9,10,16,17,18,19

p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]

rownames(p_e) <- p_e[[1]]

1.2

Without doing any reordering we cannot identify any clusters or outliers.

#p_e_sc %>% 
  plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
    z = ~p_e_sc, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

1.3

# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")

p_e_cdist <- dist(t(p_e_sc), method = "euclidean")

We used method “OLO” as the Hierarchical Clustering algorithm instead of “HC” method, because according to the documentation in seriation package and @[hahsler] the former does not optimize the Hamiltonian Path Length.

# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
order1 <- get_order(seriate(p_e_rdist, method = "OLO"))

order2 <-get_order(seriate(p_e_cdist, method = "OLO"))

p_reord <- p_e_sc[rev(order1), order2]
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
    z = ~p_reord, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist - HC)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# computing distance as one minus correlation

p_e_cor <- as.dist((1 - cor(p_e_sc))/2)

p_e_cor1 <- as.dist((1 - cor(t(p_e_sc)))/2)
# set seed to ensure results are reproducible
set.seed(1212)

# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))

ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))

# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
    z = ~p_reord2, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Cor dist)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

The ordering by euclidean distance produces a heat map that is easier to analyze. At first glance we can perceive four general regions of two groups. The first group heat map color tends towards a brighter shade of red while the second group tend towards a darker shade of red/black. Although these groups can be seen in the correlation distance heat map, it is not as clear as the first.

Based on the euclidean distance heat map, net wage tends to higher values from Dubai while the number of hours worked decrease. This is the opposite to cities like Delhi,Bankok and Seoul. Interestingly food costs are generally low in the cities with highee working hours. Caracas is an outlier because food costs are high while net wage and the number of hours worked remains low.

1.4

# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)

ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))

set.seed(111)
order_q4_2 <-get_order(seriate(p_e_cdist, method = "TSP"))

p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
    z = ~p_reord_q4, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist- TSP)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

From visual comparison of the heatmap produced by HC solver and TSP, we prefer the latter because there appears to be a separation along the anti diagonal such that cities that have similary high prices are placed along this diagonal. In the heatmap by the TSP solver this distinctionn is not quite clear.

# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# or = order

# set seed 
set.seed(222)

or1 <- seriate(p_e_rdist, method = "OLO")

set.seed(333)

or2 <- seriate(p_e_rdist, method = "TSP")

result1 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or1 ))

result2 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or2 ))
# criterion on HC solver and unordered
result1
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    756120.2      20296.15     26876 19055.56 0.20713342
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          65528         156151.06 24666913         3713.804
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3352459  894638.7 652.4429     411.5392       239.0233
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      121.9671 0.2253186
# criterion on TSP and unordered.
result2
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    838955.9      47893.87     40089 20181.60 0.11204948
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          39102          89163.24 21209322         4540.624
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3441776  939297.2 653.2224     413.0466       237.8996
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      119.0998 0.3360915

TSP solver has shorter path length (121.718) compared to HC solver (121.967) by a very small margin of .249. Thus it does a slighlty better job of optimizing the Hamiltonian Path Length. For measure of effectiveness (ME) the HC solver (ME = 652.44) has advantage over the TSP (ME = 651.598). A higher ME implies a better arrangement of the dissimilarity matrix. For gradient measures since objective is to increase the distance from main diagonal, we infer that HC (65528) does a better job of achieving this compared to TSP solver(41312).

1.5

# parallel coordinates plot from unsorted scaled data 

p_e_sc2 <- as.data.frame(p_e_sc)

p_e_sc2 <- round(p_e_sc2, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
  line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)

We can identify two clusters defined by Wage net (blue) and iphone 4s (red). Wage net has values greater than 0 in the red cluster (defined by iphone 4) while iphone has values has values greater than -0.5 in the blue cluster. These clusters are difficult to interpret. The most prominent outlier in the blue cluster is the price of Rice per kg, The lines at this variable are furthest apart . It does not seem to fit into any of the two clusters.

1.6

# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
p_ord_HC$name <- rownames(p_ord_HC)
# colapse the p_ord_HC to long format
p_ord_HC %>%
  tidyr::gather(variable, value, -name, factor_key = T) %>%
  arrange(name) %>%
  ggplot(aes(x=variable, y=value, group=name)) + 
  geom_polygon(fill="#F81894") + 
  coord_polar() + theme_classic() + 
  facet_wrap(~ name) #+ 

  #theme(axis.text.x = element_text(size = 5))

Cairo and Delhi are outliers with respect to the price of bread per kg. Prague, Johanessburg and Panama form a cluster the prices are comparatively small and similar across the products.. The most distinct outlier would be Caracas. The prices aååear higher due to the size of the radar chart.

1.7

Among Heatmaps, paralled coordinates and radar charts, heatmaps are relatively easier to interprate in terms of time and accuracy. Radar cahrts are hard to interprate when the number of variables to compare a re high. Parallel coordiantes are generally messy to work with.

Assignment 2

2.1 Scatter and Trellis plots

The points in the scatter plot are very close to each other and so, it is difficult to interpret the results for both <=50K and >50K income levels. The trellis plot shows both income levels in separate panels and so, it is quite easy to analyse the datas. The people with age range of 0-25 who earns <=50K are working for long hours than people earning >50K.

plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
         geom_point(size=1)
plot1

plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
   facet_wrap(~income_level)
plot2

2.2 Density plot

The density curve is skewed to the right (right skewed distribution) for income level <=50K and thus not normally distributed.The median income level for <=50K is less than the mean. Whereas the the density curve for income of >50K is bimodal. The trellis plot exhibits both income level for people with various marital status. The density curve for widower is normally distributed for both income levels. The people who were never married earnig <=50K lies within range of 17-40 age group.

plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
         geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3

plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4

2.3 3D-scatter plot and 2D-density plot

In 3D-scatter plot, the overlapping of data points around the same values makes it difficult to analyse the relationship between given three varaibles. In raster type 2D-density plot, every panels of age group 29-90 exhibits almost same outputs for capital loss ranging 1000-2000 except age group of 17-29.

df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
        add_markers() %>%
        layout(scene = list(xaxis = list(title = 'Education-num'),
                     yaxis = list(title = 'Age'),
                     zaxis = list(title = 'Capital loss')))
plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5

2.4 Trellis plots

The patterns in every panels of trellis plot looks almost similar except the last panel i.e, age group of 48-90. The capital loss seems to be quite high for both age group of 37-48 and 48-90.

plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
  facet_grid(cut_number(df$age,4)~.)
plot6a

Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10% 

L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))

index=c()
Class=c()
for(i in 1:nrow(L)){
  Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
  ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
  index=c(index,ind)
  Class=c(Class, rep(Cl, length(ind)))
}

df4<-df[index,]
df4$Class<-as.factor(Class)

ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
  facet_grid(Class~., labeller = "label_both")

Appendix

library(ggplot2)
library(seriation)
library(scales)
library(tidyr)
library(dplyr)
library(lattice)
library(plotly)
prices_earnings <- read.delim("prices-and-earnings.txt")
# keep columns 1,2,5,6,7,9,10,16,17,18,19

p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]

rownames(p_e) <- p_e[[1]]
# question 2 scaling
p_e_sc <- scale(p_e[,-1])

#p_e_sc %>% 
  plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
    z = ~p_e_sc, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")

p_e_cdist <- dist(t(p_e_sc), method = "euclidean")
# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
order1 <- get_order(seriate(p_e_rdist, method = "OLO"))

order2 <-get_order(seriate(p_e_cdist, method = "OLO"))

p_reord <- p_e_sc[rev(order1), order2]
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
    z = ~p_reord, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist - HC)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# computing distance as one minus correlation

p_e_cor <- as.dist((1 - cor(p_e_sc))/2)

p_e_cor1 <- as.dist((1 - cor(t(p_e_sc)))/2)
# set seed to ensure results are reproducible
set.seed(1212)

# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))

ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))

# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
    z = ~p_reord2, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Cor dist)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)

ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))

set.seed(111)
order_q4_2 <-get_order(seriate(p_e_cdist, method = "TSP"))

p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
    z = ~p_reord_q4, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist- TSP)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# or = order

# set seed 
set.seed(222)

or1 <- seriate(p_e_rdist, method = "OLO")

set.seed(333)

or2 <- seriate(p_e_rdist, method = "TSP")

result1 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or1 ))

result2 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or2 ))
# criterion on HC solver and unordered
result1
# criterion on TSP and unordered.
result2
# parallel coordinates plot from unsorted scaled data 

p_e_sc2 <- as.data.frame(p_e_sc)

p_e_sc2 <- round(p_e_sc2, 1)

p_e_sc2 %>% plot_ly(type ="parcoords",
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
 
p_e_sc2 %>% plot_ly(type ="parcoords",
  line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)
# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
p_ord_HC$name <- rownames(p_ord_HC)
# colapse the p_ord_HC to long format
p_ord_HC %>%
  tidyr::gather(variable, value, -name, factor_key = T) %>%
  arrange(name) %>%
  ggplot(aes(x=variable, y=value, group=name)) + 
  geom_polygon(fill="#F81894") + 
  coord_polar() + theme_classic() + 
  facet_wrap(~ name) #+ 
  #theme(axis.text.x = element_text(size = 5))
df <- read.csv("adult.csv")
colnames(df) <- c("age","workclass","fnlwgt","edu","edu_num","marital_status","occupation","relationship","race","sex","cap_gain","cap_loss","hours_per_week","native_country","income_level")
#View(df)
plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
         geom_point(size=1)
plot1

plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
   facet_wrap(~income_level)
plot2

plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
         geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3


plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4
df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
        add_markers() %>%
        layout(scene = list(xaxis = list(title = 'Education-num'),
                     yaxis = list(title = 'Age'),
                     zaxis = list(title = 'Capital loss')))

plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5
plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
  facet_grid(cut_number(df$age,4)~.)
plot6a
Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10% 

L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))

index=c()
Class=c()
for(i in 1:nrow(L)){
  Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
  ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
  index=c(index,ind)
  Class=c(Class, rep(Cl, length(ind)))
}

df4<-df[index,]
df4$Class<-as.factor(Class)

ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
  facet_grid(Class~., labeller = "label_both")

References